In [1]:
import pandas as pd
import numpy as np
# Read every CSV of the dataset into a dict keyed by file stem,
# e.g. "ap_2010.csv" -> data["ap_2010"].
files = ["ap_2010.csv", "class_size.csv", "demographics.csv", "graduation.csv", "hs_directory.csv", "math_test_results.csv", "sat_results.csv"]
data = {}
for filename in files:
    key = filename.replace(".csv", "")
    data[key] = pd.read_csv("Documents/DS/schools2/{0}".format(filename))
In [2]:
# Quick sanity check: print the name and first rows of every loaded table.
for name in data:
    print('\n' + name + '\n')
    print(data[name].head())
In [3]:
# Build the DBN (district-borough-number) join key for class_size, row by row
# (axis=1 applies the lambda to each row). The key is the CSD district code
# zero-padded to two digits ({0:02d} -> 1 becomes "01") followed by the
# school code, matching the DBN format used by the other tables.
data["class_size"]["DBN"] = data["class_size"].apply(
    lambda row: "{0:02d}{1}".format(row["CSD"], row["SCHOOL CODE"]),
    axis=1,
)
In [4]:
# hs_directory already carries the key under a lowercase name; mirror it to
# "DBN" so every table shares the same join column.
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
In [5]:
# Peek at hs_directory to confirm the new DBN column is in place.
data["hs_directory"].head()
Out[5]:
In [6]:
# The survey data ships as two tab-separated, windows-1252 encoded files:
# one for District 75 (special-education) schools and one for the rest.
# Load both, tag each row with its origin, and stack them into one frame.
survey1 = pd.read_csv("Documents/DS/schools2/survey_75.txt", delimiter='\t', encoding="windows-1252")
survey2 = pd.read_csv("Documents/DS/schools2/survey_full.txt", delimiter='\t', encoding="windows-1252")
survey1['d75'] = True   # District 75 schools
survey2['d75'] = False  # all other schools
survey = pd.concat([survey1, survey2])
In [7]:
# Inspect the combined survey frame.
survey.head()
Out[7]:
In [8]:
# Mirror the lowercase "dbn" into "DBN", then keep only the response-rate
# (rr_*), respondent-count (N_*) and 2011 score columns needed downstream.
# NOTE(review): "eng_t_10" looks like a possible typo for "eng_t_11" —
# confirm against the survey data dictionary before changing it.
survey["DBN"] = survey["dbn"]
survey_heads = [
    "DBN", "rr_s", "rr_t", "rr_p", "N_p", "N_s", "N_t",
    "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
    "saf_t_11", "com_t_11", "eng_t_10", "aca_t_11",
    "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
    "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11",
]
survey = survey.loc[:, survey_heads]
survey.head()  # not the cell's last expression, so this produces no output
data["survey"] = survey
In [9]:
# Confirm the trimmed survey table's dimensions.
data["survey"].shape
Out[9]:
In [10]:
# Collapse class_size to one row per school (DBN). The raw file has multiple
# rows per school (per grade span / program type), so restrict to the
# high-school general-education rows first, then average per DBN.
class_size = data['class_size']
class_size = class_size[class_size["GRADE "] == "09-12"]  # note: trailing space is part of the column name
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
# .mean(numeric_only=True) replaces the deprecated .agg(np.mean): same
# numeric-column averages, without the deprecation warning / error that
# np.mean aggregation raises in modern pandas.
class_size = class_size.groupby("DBN").mean(numeric_only=True)
# Reassign instead of inplace=True — same result, avoids hidden in-place state.
class_size = class_size.reset_index()
data["class_size"] = class_size
In [11]:
# Inspect the per-school class_size aggregate.
class_size.head()
Out[11]:
In [12]:
# Keep only the 2011-2012 school year so demographics has one row per school.
# (schoolyear is stored as the integer 20112012, not a string.)
demographics = data["demographics"]
demographics = demographics.loc[demographics["schoolyear"] == 20112012]
data["demographics"] = demographics
In [13]:
# Restrict math results to 8th grade, 2011, so there is one row per school.
# (Grade is a string column, Year an integer one.)
math = data["math_test_results"]
math = math[(math["Grade"] == "8") & (math["Year"] == 2011)]
data["math_test_results"] = math
In [14]:
# Keep one graduation row per school: the "Total Cohort" demographic for the
# 2006 cohort (Cohort is stored as a string).
data["graduation"] = data["graduation"][
    (data["graduation"]["Demographic"] == "Total Cohort")
    & (data["graduation"]["Cohort"] == "2006")
]
In [15]:
# Inspect the raw SAT results before converting score columns to numbers.
data["sat_results"].head()
Out[15]:
In [16]:
# The SAT score columns are read as strings; coerce them to numbers, turning
# unparseable values into NaN, then sum the three sections into sat_total.
# pd.to_numeric(errors="coerce") replaces DataFrame.convert_objects, which
# was deprecated in pandas 0.17 and removed in 1.0.
sat_cols = ["SAT Critical Reading Avg. Score", "SAT Math Avg. Score", "SAT Writing Avg. Score"]
for val in sat_cols:
    data["sat_results"][val] = pd.to_numeric(data["sat_results"][val], errors="coerce")
data["sat_results"]["sat_total"] = data["sat_results"][sat_cols[0]] + data["sat_results"][sat_cols[1]] + data["sat_results"][sat_cols[2]]
In [17]:
# The last line of each "Location 1" value is "(lat, lon)"; strip the
# parentheses and split on the comma to get the two coordinate strings.
# Parsing the column once (instead of once per coordinate, as before)
# avoids repeating the same string work over the whole column.
coords = data["hs_directory"]["Location 1"].apply(
    lambda x: x.split("\n")[-1].replace("(", "").replace(")", "").split(",")
)
lat = coords.apply(lambda pair: pair[0])
lon = coords.apply(lambda pair: pair[1])
In [18]:
# Check the dtype of the parsed latitudes (Series.dtypes is an alias of
# .dtype) — expected to be object here since the values came from text and
# still need numeric conversion below.
lat.dtypes
Out[18]:
In [19]:
# Convert the parsed coordinate strings to floats (NaN on failure).
# pd.to_numeric replaces the removed Series.convert_objects API.
data["hs_directory"]["lat"] = pd.to_numeric(lat, errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(lon, errors="coerce")
In [20]:
# Re-inspect every table after cleaning: first rows plus shape, to confirm
# each one is now (roughly) one row per school.
for name, frame in data.items():
    print("\n" + name + "\n")
    print(frame.head())
    print(frame.shape)
In [21]:
# Decide the merge order for the tables. The list starts as data's keys in
# dict-iteration order; individual slots are then patched by index so that
# slot 0 is "math_test_results" (the merge base in the next cell).
# FIXME(review): this is fragile — it only works for the key order of the
# kernel it was written in; the first assignment to slot 3 ("d") is dead
# code, immediately overwritten with "graduation"; and overwriting slot 0
# drops whichever key originally sat there from the merge list entirely.
# Rebuild this as an explicit, literal list of table names.
flat_data_names = [k for k,v in data.items()]
flat_data_names[3] = "d"
flat_data_names[0] = "math_test_results"
flat_data_names[3] = "graduation"
flat_data_names
Out[21]:
In [22]:
# Merge everything into one frame, starting from the math results. For each
# remaining table: print its name and how many duplicate DBNs it contains,
# choose the join type (outer for the sparser tables so schools missing from
# them are kept), and merge on DBN. "math_test_results" is skipped because
# it is already the base frame.
flat_data = [data[k] for k in flat_data_names]
full = flat_data[0]
for name, frame in zip(flat_data_names[1:], flat_data[1:]):
    print(name)
    print(len(frame["DBN"]) - len(frame["DBN"].unique()))
    join_type = "outer" if name in ["sat_results", "graduation", "ap_2010"] else "inner"
    if name not in ["math_test_results"]:
        full = full.merge(frame, on="DBN", how=join_type)
full.shape
Out[22]:
In [23]:
# The AP columns come in as strings; coerce them to numeric and treat missing
# values as 0 (schools without AP data). Note the trailing space in
# "AP Test Takers " — it is part of the real column name.
# pd.to_numeric(errors="coerce") replaces the removed convert_objects API.
ap_cols = ["AP Test Takers ", "Total Exams Taken", "Number of Exams with scores 3 4 or 5"]
for col in ap_cols:
    full[col] = pd.to_numeric(full[col], errors="coerce")
full[ap_cols] = full[ap_cols].fillna(value=0)
In [24]:
# The first two characters of a DBN are the school district code.
full["school_dist"] = full["DBN"].apply(lambda dbn: dbn[:2])
In [25]:
# Fill remaining NaNs with each numeric column's mean so the correlations
# below have no holes. numeric_only=True makes the old implicit
# numeric-only behavior of DataFrame.mean explicit — modern pandas raises
# on mixed-dtype frames without it.
full = full.fillna(full.mean(numeric_only=True))
In [26]:
# Correlate every numeric column with the combined SAT score to find
# candidate relationships to investigate below.
# NOTE(review): on pandas >= 2.0 this needs full.corr(numeric_only=True) —
# confirm the installed version.
full.corr()["sat_total"]
Out[26]:
In [36]:
import folium
from folium import plugins
# Clustered marker map: one marker per school, popup shows "DBN: name".
# MarkerCluster lives in folium.plugins in current folium releases (it was
# removed from the top-level namespace), and Map.create_map was replaced by
# Map.save — which the heatmap cell below already uses.
base_map = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
marker_cluster = plugins.MarkerCluster().add_to(base_map)
for name, row in full.iterrows():
    folium.Marker(
        [row["lat"], row["lon"]],
        popup="{0}: {1}".format(row["DBN"], row["school_name"]),
    ).add_to(marker_cluster)
base_map.save('schools.html')
base_map
Out[36]:
In [38]:
# Density heatmap of school locations across the city.
# add_child replaces the deprecated add_children alias.
schools_heatmap = folium.Map(location=[full["lat"].mean(), full["lon"].mean()], zoom_start=10)
schools_heatmap.add_child(plugins.HeatMap([[row['lat'], row['lon']] for name, row in full.iterrows()]))
schools_heatmap.save("heatmap.html")
schools_heatmap
Out[38]:
In [42]:
# Average every numeric column per school district for the district maps.
# .mean(numeric_only=True) replaces the deprecated .agg(np.mean); reassigning
# replaces reset_index(inplace=True) with the same result.
district_group = full.groupby('school_dist').mean(numeric_only=True)
district_group = district_group.reset_index()
# Normalize the district code ("01" -> "1") — presumably to match the
# geojson's district ids used by key_on in build_district_map; confirm
# against the geojson properties.
district_group['school_dist'] = district_group['school_dist'].apply(lambda x: str(int(x)))
In [43]:
# Inspect the per-district averages.
district_group.head()
Out[43]:
In [58]:
def build_district_map(col):
    """Render a district-level choropleth of `col`, save it, and return it.

    Colors each NYC school district polygon (from the School Districts
    geojson) by the district's value of `col` in the module-level
    `district_group` frame; the map is centered on the mean school location
    in `full`. Also writes the map to district.html.
    """
    geo_path = "Documents/DS/schools2/School Districts.geojson"
    districts = folium.Map(location=[full['lat'].mean(), full['lon'].mean()], zoom_start=10)
    # folium.Choropleth replaces the removed Map.geo_json API; the old
    # geo_path= argument is now geo_data=.
    folium.Choropleth(
        geo_data=geo_path,
        data=district_group,
        columns=["school_dist", col],
        key_on='feature.properties.school_dist',
        fill_color='YlGn',
        line_opacity=0.2,
        fill_opacity=0.7,
    ).add_to(districts)
    districts.save("district.html")
    return districts

build_district_map('sat_total')
Out[58]:
In [48]:
%matplotlib inline
# Does school size track SAT performance? Scatter enrollment vs. total score.
full.plot.scatter(x="total_enrollment", y="sat_total")
Out[48]:
In [54]:
# Name the small, low-scoring schools from the bottom-left of the scatter.
full[(full['sat_total'] < 1000) & (full["total_enrollment"] < 1000)]["School Name"]
Out[54]:
In [55]:
# English-language-learner share vs. total SAT score.
full.plot.scatter(x='ell_percent', y='sat_total')
Out[55]:
In [59]:
# District-level map of English-language-learner share.
build_district_map('ell_percent')
Out[59]:
In [66]:
# Bar chart: how each survey field correlates with the total SAT score.
# (A plain list literal replaces the original split-a-long-string trick.)
full.corr()["sat_total"][["rr_s", "rr_t", "rr_p", "N_p", "N_s", "N_t", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]].plot.bar()
Out[66]:
In [73]:
# Correlation of each racial-composition column with the total SAT score.
full.corr()["sat_total"][["asian_per", "black_per", "hispanic_per", "white_per"]].plot.bar()
Out[73]:
In [74]:
# How strongly ELL share and Hispanic share move together (single scalar).
full.corr()["ell_percent"]["hispanic_per"]
Out[74]:
In [75]:
# Visualize the ELL share / Hispanic share relationship from the cell above.
full.plot.scatter(x="ell_percent", y="hispanic_per")
Out[75]:
In [76]:
# District-level map of Hispanic student share.
build_district_map("hispanic_per")
Out[76]:
In [78]:
# Correlation of male/female share with the total SAT score.
full.corr()["sat_total"][["male_per", "female_per"]].plot.bar()
Out[78]:
In [79]:
# Female share vs. total SAT score.
full.plot.scatter(x="female_per", y="sat_total")
Out[79]:
In [82]:
# Name the high-scoring, majority-female schools from the scatter above.
full[(full["sat_total"] > 1400) & (full["female_per"] > 65)]["School Name"]
Out[82]:
In [83]:
# Share of enrolled students who took an AP exam. (The trailing space in
# "AP Test Takers " is part of the real column name.)
full["avg_ap"] = full["AP Test Takers "] / full["total_enrollment"]
In [84]:
# Correlation between AP-taking share and total SAT score (single scalar).
full.corr()["sat_total"]["avg_ap"]
Out[84]:
In [85]:
# AP-taking share vs. total SAT score.
full.plot.scatter(x="avg_ap", y='sat_total')
Out[85]:
In [87]:
# Name the schools with both high AP participation and high SAT totals.
full[(full['avg_ap'] > 0.3) & (full['sat_total'] > 1700)]["School Name"]
Out[87]:
In [ ]: